""" after running, a rest is needed"""
!pip install dython
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from google.colab import files
from google.colab import drive
from sklearn.utils import resample
from imblearn import under_sampling
from imblearn import over_sampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from dython.nominal import associations
from scipy.stats import norm
import math
import matplotlib.pyplot as plt
from tqdm import tqdm
from mpl_toolkits.mplot3d import Axes3D
from dataclasses import dataclass
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import KFold
# --- Load the bank-marketing dataset (Colab paths) and run a first look ---
#drive.mount('drive', force_remount=True)
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/bank-full.csv", sep=";")
#df = pd.read_csv("/content/bank-full.csv", sep=";")
df.head()
df.info()
# Keep the original yes/no target labels for plotting before encoding.
yplt = df['y']
df['y']=df['y'].map({'yes':1,'no':0})
# pdays == -1 presumably means "never contacted" (see the commented
# 'not_contacted' binning further down) — recode it as 0.
df['pdays']=df['pdays'].replace(-1,0)
df
print(df.isnull().sum())
print('\nThe dataset shape is', df.shape)
df.describe()
# Association matrix across numeric and categorical columns (dython), saved to file.
correlation= associations(df, filename= 'df_correlation.png', figsize=(12,11))
# Histograms of the boolean-ish variables, one subplot each.
fig = make_subplots(rows=2, cols=2,
                    subplot_titles=["Target (Y)", "Housing", "Default", "Loan"],
                    shared_xaxes=True, vertical_spacing=0.15)
for values, label, r, c in [
        (yplt, 'Target', 1, 1),
        (df.housing, 'Housing', 1, 2),
        (df.default, 'Default', 2, 1),
        (df.loan, 'Loan', 2, 2)]:
    fig.add_trace(go.Histogram(x=values, name=label, texttemplate="%{y}",
                               marker_color="rgb(62,130,171)"), row=r, col=c)
fig.update_layout(height=450, width=700, title='Boolean Variables', title_x=0.5,
                  showlegend=False)
# Histograms of the low-cardinality categorical variables.
fig2 = make_subplots(rows=2, cols=2,
                     subplot_titles=["Education", "Marital", "Contact", "pOutcome"])
for values, label, r, c in [
        (df.education, 'Education', 1, 1),
        (df.marital, 'Marital', 1, 2),
        (df.contact, 'Contact', 2, 1),
        (df.poutcome, 'pOutcome', 2, 2)]:
    fig2.add_trace(go.Histogram(x=values, name=label, texttemplate="%{y}",
                                marker_color="rgb(62,130,171)"), row=r, col=c)
fig2.update_layout(height=450, width=1000, title='Categorical Variables',
                   title_x=0.5, showlegend=False)
# The next figure reuses (overwrites) the fig2 name, as in the original.
fig2 = make_subplots(rows=2, cols=1, subplot_titles=["Job", "Month"])
for values, label, r in [(df.job, 'Job', 1), (df.month, 'Month', 2)]:
    fig2.add_trace(go.Histogram(x=values, name=label, texttemplate="%{y}",
                                marker_color="rgb(62,130,171)"), row=r, col=1)
fig2.update_layout(height=750, width=900, title='Categorical Variables',
                   title_x=0.5, showlegend=False)
# Histograms of the numeric variables (trace order preserved from the original).
fig3 = make_subplots(rows=2, cols=3,
                     subplot_titles=["Age", "Campaign", "Previous", "Duration", "pDays"])
for values, label, r, c in [
        (df.age, 'Age', 1, 1),
        (df.duration, 'Duration', 2, 1),
        (df.campaign, 'Campaign', 1, 2),
        (df.pdays, 'pDays', 2, 2),
        (df.previous, 'Previous', 1, 3)]:
    fig3.add_trace(go.Histogram(x=values, name=label, marker_color="rgb(62,130,171)"),
                   row=r, col=c)
fig3.update_layout(height=750, width=1000, title='Histograms of Categorical Variables',
                   title_x=0.5, showlegend=False)
# Pairwise scatter matrix colored by the target.
sns.pairplot(data=df, hue="y");
# Boxplots of the numeric columns (before any transformation).
fig4 = make_subplots(rows=2, cols=3,
                     subplot_titles=["Age", "Balance", "Duration", "Day",
                                     "pDays", "Campaign", "Previous"])
box_specs = [("age", "Age", 1, 1), ("balance", "Balance", 1, 2),
             ("duration", "Duration", 1, 3), ("day", "Day", 2, 1),
             ("pdays", "pDays", 2, 2), ("campaign", "Campaign", 2, 3)]
for col_name, label, r, c in box_specs:
    fig4.add_trace(go.Box(
        y=df[col_name],
        name=label,
        boxpoints='outliers',  # only outliers
        marker_color='rgb(107,174,214)',
        line_color='rgb(107,174,214)'
    ), row=r, col=c)
fig4.update_layout(height=750, width=1000, title='Boxplots of Selected Columns', title_x=0.5)
# It can be seen that many of the numeric features are right-skewed, which also
# results in many outliers. Hence a log transform (log1p) is used to reduce the skewness.
# Compress the right-skewed features with log1p. The original duplicated a
# fit()/transform() pair per column; fit_transform in a loop is equivalent.
# 'balance' was deliberately left commented out in the original — presumably
# because log1p is undefined for its values below -1; TODO confirm.
lg_tr = FunctionTransformer(np.log1p, validate=True)
for skewed_col in ['pdays', 'duration', 'campaign', 'previous']:
    df[skewed_col] = lg_tr.fit_transform(df[[skewed_col]])
# Correlation heatmap of the dataframe after the log transforms.
fig, ax = plt.subplots(figsize=(11, 9))
# Diverging palette so positive and negative correlations are visually distinct.
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(df.corr(), annot= True, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
# Pairwise scatter matrix colored by the target, after the log transforms.
sns.pairplot(data=df, hue="y");
# Same boxplot panel as before, now reflecting the log1p-transformed columns.
fig4 = make_subplots(rows=2, cols=3,
                     subplot_titles=["Age", "Balance", "Duration", "Day",
                                     "pDays", "Campaign", "Previous"])
for col_name, label, r, c in [
        ("age", "Age", 1, 1), ("balance", "Balance", 1, 2),
        ("duration", "Duration", 1, 3), ("day", "Day", 2, 1),
        ("pdays", "pDays", 2, 2), ("campaign", "Campaign", 2, 3)]:
    fig4.add_trace(go.Box(
        y=df[col_name],
        name=label,
        boxpoints='outliers',  # only outliers
        marker_color='rgb(107,174,214)',
        line_color='rgb(107,174,214)'
    ), row=r, col=c)
fig4.update_layout(height=750, width=1000, title='Boxplots of Selected Columns', title_x=0.5)
# Integer-encode the categorical columns for the feature-selection models below.
# NOTE(review): `selection = df` is an alias, NOT a copy — every replace() below
# also mutates df in place; the later cells appear to rely on this, so it is
# documented rather than changed.
selection = df
selection['job'].replace({'technician':0, 'admin.':1, 'blue-collar':2, 'services':3, 'management':4,
'unknown':5, 'unemployed':6, 'student':7, 'entrepreneur':8, 'retired':9,
'self-employed':10, 'housemaid':11}, inplace=True)
selection['housing'].replace({'no':0, 'yes':1}, inplace=True)
selection['contact'].replace({'unknown':0, 'cellular':1,'telephone':2}, inplace=True)
selection['month'].replace({'jan':1,'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6,
'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec':12}, inplace=True)
selection['poutcome'].replace({'unknown':0, 'failure':1, 'success':2, 'other':3}, inplace=True)
selection['marital'].replace({'divorced':0, 'married':1, 'single':2}, inplace=True)
selection['education'].replace({'primary':0, 'secondary':1, 'tertiary':2, 'unknown':3}, inplace=True)
# NOTE(review): pdays was already recoded (-1 -> 0) and log1p-transformed
# earlier, so this replace should be a no-op — confirm.
selection['pdays']=df['pdays'].replace(-1,0)
# bins = [0, 0.5, 1000]
# labels = ['not_contacted', 'contacted']
# df['pdays'] = pd.cut(df['pdays'], bins, labels = labels,include_lowest = True)
#df['pdays'].replace({'not_contacted':0, 'contacted':1}, inplace=True)
selection['default'].replace({'no':0, 'yes':1}, inplace=True)
selection['loan'].replace({'no':0, 'yes':1}, inplace=True)
# Features are the first 16 columns; target is the last column ('y').
X = selection.iloc[:,0:16]
y = selection.iloc[:,-1]
def _plot_feature_importance(model, X, y, title):
    """Fit *model* on (X, y) and draw a horizontal bar chart of its feature
    importances, sorted ascending so the most important feature appears on top."""
    model.fit(X, y)
    importances = pd.DataFrame({'feature': X.columns,
                                'importance': model.feature_importances_})
    importances = importances.sort_values('importance')
    ax = importances.plot(kind='barh', x='feature', legend=False)
    ax.set_ylabel('')
    ax.set_title(title)
    plt.show()

# The two copy-pasted plotting cells are folded into the helper above.
_plot_feature_importance(ExtraTreesClassifier(), X, y,
                         'Variable importance plot from ExtraTreesClassifier')
_plot_feature_importance(RandomForestClassifier(), X, y,
                         'Variable importance plot from Random forest')
# Chi-squared univariate feature scoring ('balance' excluded: chi2 requires
# non-negative inputs).
selection1 = selection.drop(['balance'], axis='columns')
X = selection1.iloc[:, 0:15]
y = selection1.iloc[:, -1]
fit = SelectKBest(score_func=chi2, k=10).fit(X, y)
# One frame with the column names and their chi2 scores, for easy ranking.
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})
print(featureScores.nlargest(10, 'Score'))  # print 10 best features
# From the above analysis, the following features will be dropped.
# Drop the features deemed unimportant by the selection step above.
df = df.drop(['marital', 'default', 'loan', 'contact', 'day', 'pdays', 'previous'], axis = 'columns')
df
# NOTE(review): the triple-quoted block below is dead code kept as a string
# literal in the original notebook (superseded by the `selection` cell above).
"""df['job'].replace({'technician':0, 'admin.':1, 'blue-collar':2, 'services':3, 'management':4,
'unknown':5, 'unemployed':6, 'student':7, 'entrepreneur':8, 'retired':9,
'self-employed':10, 'housemaid':11}, inplace=True)
df['housing'].replace({'no':0, 'yes':1}, inplace=True)
df['contact'].replace({'unknown':0, 'cellular':1,'telephone':2}, inplace=True)
df['month'].replace({'jan':1,'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6,
'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec':12}, inplace=True)
df['poutcome'].replace({'unknown':0, 'failure':1, 'success':2, 'other':3}, inplace=True)
df['marital'].replace({'divorced':0, 'married':1, 'single':2}, inplace=True)
df['education'].replace({'primary':0, 'secondary':1, 'tertiary':2, 'unknown':3}, inplace=True)
df['pdays']=df['pdays'].replace(-1,0)
# bins = [0, 0.5, 1000]
# labels = ['not_contacted', 'contacted']
# df['pdays'] = pd.cut(df['pdays'], bins, labels = labels,include_lowest = True)
#df['pdays'].replace({'not_contacted':0, 'contacted':1}, inplace=True)
df['default'].replace({'no':0, 'yes':1}, inplace=True)
df['loan'].replace({'no':0, 'yes':1}, inplace=True)"""
df_temp = df["y"].unique()  # NOTE(review): computed but never used afterwards
df['y'].value_counts()
from sklearn import preprocessing
# Split features/target: first 9 columns are the remaining features.
X = df.iloc[:,0:9]
y = df.iloc[:,-1]
X_num = X[['age', 'balance', 'duration', 'campaign']]
X_cat = X[['job', 'education', 'housing', 'month', 'poutcome']]
#X_cat = X[['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
# 'month']]
X_num = (X_num - X_num.mean()) / X_num.std() #standardizing
# NOTE(review): X_norm min-max-normalizes the ALREADY standardized values — confirm intent.
X_norm = (X_num - X_num.min()) / (X_num.max() - X_num.min()) #normalizing
# X_resample_norm.to_csv('X_resample_norm.csv')
# 'naive' keeps the raw (un-dummied) categoricals for the hand-rolled NaiveBayes below.
naive = pd.concat([X_num, X_cat, y], axis=1)
#dummies for categorical variables
X_cat = pd.get_dummies(X_cat, columns = ['job', 'education','month','poutcome'],
drop_first = True)
#X_cat['default'].replace({'no':0, 'yes':1}, inplace=True)
#X_cat['loan'].replace({'no':0, 'yes':1}, inplace=True)
X_cat['housing'].replace({'no':0, 'yes':1}, inplace=True)
# Normalized numerics + dummies: used by sklearn LR and the MLP experiments.
X_scaled1 = pd.concat([X_norm, X_cat], axis=1)
# Rebalanced variants (minority:majority ratio 0.5) for later experiments.
oversample = RandomOverSampler(sampling_strategy=0.5)
X_over, y_over = oversample.fit_resample(X_scaled1, y)
undersample = RandomUnderSampler(sampling_strategy=0.5)
X_under, y_under = undersample.fit_resample(X_scaled1, y)
X_scaled_under = pd.concat([X_under, y_under], axis=1)
# Standardized numerics + dummies + target: used by the custom LR below.
X_scaled = pd.concat([X_num, X_cat, df["y"]], axis=1) #combine numerical & categorical
X_scaled.to_csv('X_scaled.csv', index=False)
X_scaled
X_scaled.columns
#SK-learn
# Baseline logistic regression (sklearn) on the normalized feature matrix.
# NOTE(review): the custom LogisticRegression class defined below shadows
# this import for the rest of the file.
from sklearn.linear_model import LogisticRegression
x_train,x_test,y_train,y_test=train_test_split(X_scaled1,y,train_size=0.75)
model_L = LogisticRegression(random_state = 5)
model_L.fit(x_train, y_train)
pred4 = model_L.predict(x_test)
print(metrics.classification_report(y_test, pred4))
@dataclass
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    NOTE(review): this class shadows the sklearn LogisticRegression imported
    earlier in the file; from here on the custom version is the one in scope.

    Attributes set by fit():
        w -- weight vector including the bias term at index 0.
    """
    maxIteration: int    # maximum number of gradient-descent iterations
    learningRate: float  # gradient-descent step size
    epsilon: float       # stop when the absolute cost change falls below this

    def addX0(self, X):
        """Prepend a bias column of ones to the design matrix X."""
        return np.column_stack([np.ones(X.shape[0]), X])

    def dataReader(self):
        """Load a CSV from self.filePath and create a 70/30 train/test split.

        NOTE(review): `filePath` is not a declared dataclass field — it must be
        assigned on the instance before calling this method.
        """
        train_df = pd.read_csv(self.filePath)
        # Drop the first column (an index column written by a previous to_csv).
        train_df.drop(columns=train_df.columns[0], axis=1, inplace=True)
        X_train, X_test, y_train, y_test = train_test_split(
            train_df[train_df.columns[:-1]], train_df[train_df.columns[-1]],
            test_size=0.3, random_state=1)
        self.X_train, self.X_test, self.y_train, self.y_test = (
            np.array(X_train), np.array(X_test), np.array(y_train), np.array(y_test))
        return

    def sigmoid(self, z):
        """Logistic function 1 / (1 + e^-z)."""
        return 1 / (1 + np.exp(-z))

    def predict(self, X):
        """Return the raw linear scores X.dot(w) (logits, NOT probabilities).

        Callers apply sigmoid() themselves. The original also computed the
        sigmoid here and discarded it (dead code, removed).
        """
        return X.dot(self.w)

    def costFunction(self, X, y):
        """Unnormalized negative log-likelihood (log loss) of y given X."""
        sig = self.sigmoid(self.predict(X))
        loss = y.dot(np.log(sig)) + (1 - y).dot(np.log(1 - sig))
        return -loss.sum()

    def gradient(self, X, y):
        """Gradient of the log loss with respect to w."""
        sig = self.sigmoid(X.dot(self.w))
        return (sig - y).dot(X)

    def gradientDescent(self, X, y):
        """Run gradient descent with a small L2 penalty term (0.01 * w) until the
        cost change drops below epsilon or maxIteration is reached."""
        cost_sequences = []
        last_cost = float('inf')
        for _ in tqdm(range(self.maxIteration)):
            self.w = self.w - self.learningRate * (self.gradient(X, y) + 0.01 * self.w)
            cur_cost = self.costFunction(X, y)
            diff = last_cost - cur_cost
            last_cost = cur_cost
            cost_sequences.append(cur_cost)
            if abs(diff) < self.epsilon:
                print('The model stopped : Converged')
                break
        return self.plotCost(cost_sequences)

    def evaluate(self, y, y_hat):
        """Plot the ROC curve and return (accuracy, precision, recall, f_score,
        misclassified), the first four rounded to 3 decimals."""
        accuracy = (y == y_hat).sum() / y.size
        fpr, tpr, _ = metrics.roc_curve(y, y_hat)
        auc = metrics.roc_auc_score(y, y_hat)
        f1 = plt.figure()
        plt.plot(fpr, tpr, label="AUC=" + str(auc))
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend(loc=4)
        plt.show()
        # Convert to boolean masks for the manual precision/recall computation.
        y = (y == 1)
        y_hat = (y_hat == 1)
        precision = (y & y_hat).sum() / y_hat.sum()
        recall = (y & y_hat).sum() / y.sum()
        f_score = 2 * (precision * recall) / (precision + recall)
        misclassified = (y != y_hat).sum()
        return round(accuracy, 3), round(precision, 3), round(recall, 3), round(f_score, 3), misclassified

    def fit(self, X, y):
        """Train on (X, y); return a one-row DataFrame of training metrics."""
        print('Solving using Gradient Descent')
        X = self.addX0(X)
        self.w = np.zeros(X.shape[1], dtype=np.float64)  # was np.ones(...) * 0
        self.gradientDescent(X, y)
        y_hat = np.round(self.sigmoid(self.predict(X)))
        accuracy, precision, recall, f_score, misclassified = self.evaluate(y, y_hat)
        return pd.DataFrame({
            "Accuracy": [accuracy],
            "Precision": [precision],
            "Recall": [recall],
            "f1_score": [f_score],
            "Misclassified": [misclassified]
        })

    def prediction(self, X, y):
        """Evaluate the trained model on held-out (X, y); returns a one-row
        metrics DataFrame."""
        X = self.addX0(X)
        y_hat = np.round(self.sigmoid(self.predict(X)))
        accuracy, precision, recall, f_score, misclassified = self.evaluate(y, y_hat)
        return pd.DataFrame({
            "Accuracy": [accuracy],
            "Precision": [precision],
            "Recall": [recall],
            "f1_score": [f_score],
            "Misclassified": [misclassified]
        })

    def plotCost(self, error_sequences):
        """Plot the cost against the iteration number."""
        s = np.array(error_sequences)
        t = np.arange(s.size)
        fig, ax = plt.subplots()
        ax.plot(t, s)
        ax.set(xlabel='Iteration', ylabel='Error')

    def plot(self):
        """3D scatter of the first two training features vs. predicted P(y=1)."""
        plt.figure(figsize=(12, 8))
        ax = plt.axes(projection='3d')
        ax.scatter3D(self.X_train[:, 0], self.X_train[:, 1],
                     self.sigmoid(self.X_train.dot(self.w)),
                     c=self.y_train[:], cmap='viridis', s=100);
        # Axis limits are hard-coded for a specific dataset — TODO confirm ranges.
        ax.set_xlim3d(55, 80)
        ax.set_ylim3d(80, 240)
        plt.xlabel('$x_1$ feature', fontsize=15)
        plt.ylabel('$x_2$ feature', fontsize=15, )
        ax.set_zlabel('$P(Y = 1|x_1, x_2)$', fontsize=15, rotation=0)

    def scatterPlt(self):
        """2D decision-boundary contour over a fixed grid of the first two features."""
        x_min, x_max = 55, 80
        y_min, y_max = 80, 240
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 250),
                             np.linspace(y_min, y_max, 250))
        grid = np.c_[xx.ravel(), yy.ravel()]
        probs = grid.dot(self.w).reshape(xx.shape)
        f, ax = plt.subplots(figsize=(14, 12))
        ax.contour(xx, yy, probs, levels=[0.5], cmap="Greys", vmin=0, vmax=.6)
        ax.scatter(self.X_train[:, 0], self.X_train[:, 1],
                   c=self.y_train[:], s=50,
                   cmap="RdBu", vmin=-.2, vmax=1.2,
                   edgecolor="white", linewidth=1)
        plt.xlabel('x1 feature')
        plt.ylabel('x2 feature')

    def plot3D(self):
        """3D contour of the raw scores over the same fixed grid."""
        x_min, x_max = 55, 80
        y_min, y_max = 80, 240
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 250),
                             np.linspace(y_min, y_max, 250))
        grid = np.c_[xx.ravel(), yy.ravel()]
        probs = grid.dot(self.w).reshape(xx.shape)
        fig = plt.figure(figsize=(14, 12))
        ax = plt.axes(projection='3d')
        ax.contour3D(xx, yy, probs, 50, cmap='binary')
        ax.scatter3D(self.X_train[:, 0], self.X_train[:, 1],
                     c=self.y_train[:], s=50,
                     cmap="RdBu", vmin=-.2, vmax=1.2,
                     edgecolor="white", linewidth=1)
        ax.set_xlabel('x1')
        ax.set_ylabel('x2')
        ax.set_zlabel('probs')
        ax.set_title('3D contour')
        plt.show()

    # Java-style accessors kept only for backward compatibility with any callers.
    def getMaxIteration(self):
        return self.maxIteration

    def setMaxIteration(self, maxIteration):
        self.maxIteration = maxIteration
        return

    def getX_train(self):
        return self.X_train
def _run_lr_experiment(lr, X, y, n_splits=5):
    """5-fold CV for the custom LogisticRegression: each training fold is
    oversampled (ratio 0.2) then undersampled (ratio 0.4) before fitting;
    returns the per-'Data' (train/test) mean of the collected metrics.

    DataFrame.append was removed in pandas 2.0 — frames are collected in a
    list and concatenated once instead.
    """
    frames = []
    over_sample = RandomOverSampler(sampling_strategy=0.2)
    under_sample = RandomUnderSampler(sampling_strategy=0.4)
    for train_index, test_index in KFold(n_splits=n_splits).split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train, y_train = over_sample.fit_resample(X_train, y_train)
        X_train, y_train = under_sample.fit_resample(X_train, y_train)
        frames.append(lr.fit(X_train, y_train))
        frames.append(lr.prediction(X_test, y_test))
    results = pd.concat(frames, ignore_index=True)
    results['Data'] = np.array(["train", "test"] * n_splits)
    # Average over the k folds, train row before test row.
    return results.groupby('Data').mean().sort_values("Data", ascending=False)

X = np.array(X_scaled.iloc[:, 0:9])
y = np.array(X_scaled.iloc[:, -1])
# The four learning-rate experiments, previously four copy-pasted cells.
# (In the notebook the summaries were displayed; print them in script form.)
for rate in (0.1, 1, 0.0001, 0.00001):
    lr = LogisticRegression(epsilon=0.00005, learningRate=rate, maxIteration=10000)
    print(_run_lr_experiment(lr, X, y))
from dataclasses import dataclass
from sklearn import metrics
def dataprocessing(df):
    """Split df into an 80/20 train/test split, then rebalance only the
    training split: oversample the minority class to a 0.3 ratio, then
    undersample the majority class down to parity (ratio 1)."""
    features, target = df[df.columns[:-1]], df[df.columns[-1]]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=0)
    X_train, y_train = RandomOverSampler(sampling_strategy=0.3).fit_resample(X_train, y_train)
    X_train, y_train = RandomUnderSampler(sampling_strategy=1).fit_resample(X_train, y_train)
    return X_train, X_test, y_train, y_test
class NaiveBayes:
    """Naive Bayes classifier mixing Gaussian likelihoods (numeric features)
    with Laplace-smoothed frequency likelihoods (string features).

    Assumes a BINARY target: the posterior normalization indexes the "other"
    class as (1 - j).

    Fixes vs. the original: a stray line-continuation backslash at the end of
    evaluate()'s return glued the next top-level statement into the method
    (syntax error); it is removed, along with the premature duplicate
    `nb = NaiveBayes(...)` that followed it.
    """

    def __init__(self, X, y):
        self.X_train = X  # feature DataFrame
        self.y_train = y  # target Series

    def calprior(self):
        """Return the class prior probabilities, ordered as y_train.unique()."""
        classes = list(self.y_train.unique())
        prior = []
        for i in classes:
            prior.append(len(self.y_train[self.y_train == i]) / len(self.y_train))
        return prior

    def likelihood(self, feat_name, feat_val, label):
        """P(feat_val | label): Gaussian pdf for numeric values,
        Laplace-smoothed relative frequency for string values."""
        df = self.X_train[self.y_train == label]
        if type(feat_val) != str:
            x = self.fitDistribution(df[feat_name])
            x = x.pdf(feat_val)
            return x
        if type(feat_val) == str:
            # +1 / +8 Laplace smoothing; the 8 presumably matches the largest
            # category cardinality in this dataset — TODO confirm.
            x = (len(df[df[feat_name] == feat_val]) + 1) / (len(df) + 8)
            return x

    def fitDistribution(self, data):
        """Fit a Gaussian (mean, population std) to a numeric column."""
        mean = np.mean(data)
        std = np.std(data)
        return norm(mean, std)

    def fit(self):
        """Classify the training set itself; print and return the metrics as a
        one-row DataFrame."""
        features = list(self.X_train.columns)
        prior = self.calprior()
        y_pred = []
        for x, y in zip(np.array(self.X_train), np.array(self.y_train)):
            labels = list(self.y_train.unique())
            likelihood = [1] * len(labels)
            for j in range(len(labels)):
                for i in range(len(features)):
                    likelihood[j] *= self.likelihood(features[i], x[i], labels[j])
            post_prob = [1] * len(labels)
            for j in range(len(labels)):
                # Binary-only normalization: the other class is index (1 - j).
                post_prob[j] = (likelihood[j] * prior[j]) / (
                    likelihood[j] * prior[j] + likelihood[1 - j] * prior[1 - j])
            y_pred.append(np.argmax(post_prob))
        y_pred = np.array(y_pred)
        y_train = np.array(self.y_train)
        accuracy, precision, recall, f_score, misclassified = self.evaluate(y_train, y_pred)
        print('f1_score:', f_score)
        print('Accuracy :', accuracy)
        print('Precision :', precision)
        print('Recall :', recall)
        print('Misclassified', misclassified)
        return pd.DataFrame({
            "Accuracy": [accuracy],
            "Precision": [precision],
            "Recall": [recall],
            "f1_score": [f_score],
            "Misclassified": [misclassified]})

    def predict(self, X, y):
        """Classify held-out (X, y); print and return the metrics as a
        one-row DataFrame."""
        features = list(X.columns)
        y_pred = []
        prior = self.calprior()
        for x, k in zip(np.array(X), np.array(y)):
            labels = list(np.unique(y))
            likelihood = [1] * len(labels)
            for j in range(len(labels)):
                for i in range(len(features)):
                    likelihood[j] *= self.likelihood(features[i], x[i], labels[j])
            post_prob = [1] * len(labels)
            for j in range(len(labels)):
                post_prob[j] = (likelihood[j] * prior[j]) / (
                    likelihood[j] * prior[j] + likelihood[1 - j] * prior[1 - j])
            y_pred.append(np.argmax(post_prob))
        y_pred = np.array(y_pred)
        y_test = np.array(y)
        accuracy, precision, recall, f_score, misclassified = self.evaluate(y_test, y_pred)
        print('f1_score:', f_score)
        print('Accuracy :', accuracy)
        print('Precision :', precision)
        print('Recall :', recall)
        print('Misclassified', misclassified)
        return pd.DataFrame({
            "Accuracy": [accuracy],
            "Precision": [precision],
            "Recall": [recall],
            "f1_score": [f_score],
            "Misclassified": [misclassified]})

    def evaluate(self, y, y_hat):
        """Plot the ROC curve and return (accuracy, precision, recall, f_score,
        misclassified), the first four rounded to 3 decimals."""
        accuracy = (y == y_hat).sum() / y.size
        fpr, tpr, _ = metrics.roc_curve(y, y_hat)
        auc = metrics.roc_auc_score(y, y_hat)
        f1 = plt.figure()
        plt.plot(fpr, tpr, label="AUC=" + str(auc))
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend(loc=4)
        plt.show()
        y = (y == 1)
        y_hat = (y_hat == 1)
        precision = (y & y_hat).sum() / y_hat.sum()
        recall = (y & y_hat).sum() / y.sum()
        f_score = 2 * (precision * recall) / (precision + recall)
        misclassified = (y != y_hat).sum()
        return round(accuracy, 3), round(precision, 3), round(recall, 3), round(f_score, 3), misclassified
# Train and evaluate the hand-rolled Naive Bayes on the rebalanced split.
X_train, X_test, y_train, y_test = dataprocessing(naive)
nb = NaiveBayes(X_train, y_train)
# DataFrame.append was removed in pandas 2.0 — concatenate the metric rows instead.
results = pd.concat([nb.fit(), nb.predict(X_test, y_test)], ignore_index=True)
from sklearn.neural_network import MLPClassifier

def _run_mlp(X, y, learning_rate):
    """Train an MLP (logistic activation) on a fresh 75/25 split and print the
    classification report and ROC curve."""
    x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.75)
    nn = MLPClassifier(random_state=1, activation='logistic',
                       learning_rate_init=learning_rate)
    nn.fit(x_train, y_train)
    pred5 = nn.predict(x_test)
    print(metrics.classification_report(y_test, pred5))
    # plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay replaces it.
    print(metrics.RocCurveDisplay.from_estimator(nn, x_test, y_test))

# The eight copy-pasted cells, folded into loops over the learning rates.
# Original (normalized) data:
for rate in (0.001, 0.01):
    _run_mlp(X_scaled1, y, rate)
# Undersampled data:
for rate in (0.001, 0.01, 0.1):
    _run_mlp(X_under, y_under, rate)
# Oversampled data:
for rate in (0.001, 0.01, 0.1):
    _run_mlp(X_over, y_over, rate)
#SVM
class KernelSvmClassifier:
    """Kernel SVM trained by maximizing the Lagrangian dual with SLSQP.

    NOTE: fit() relies on `from scipy import optimize` having been executed
    (in this file it happens just before fit() is first called).
    The numbered debug prints ("1".."7") in the original fit() were removed.
    """

    def __init__(self, C, kernel):
        self.C = C              # box constraint on the dual variables
        self.kernel = kernel    # k(x1, x2) -> float, e.g. a Gaussian RBF
        self.alpha = None
        self.supportVectors = None

    def fit(self, X, y):
        """Solve the dual problem for labels y in {-1, +1} and record the
        support vectors and their alpha*y weights."""
        N = len(y)
        # Gram matrix of k(xi, xj), scaled by yi*yj.
        hXX = np.apply_along_axis(
            lambda x1: np.apply_along_axis(lambda x2: self.kernel(x1, x2), 1, X),
            1, X)
        yp = y.reshape(-1, 1)
        GramHXy = hXX * np.matmul(yp, yp.T)

        # Lagrange dual objective.
        def Ld0(G, alpha):
            return alpha.sum() - 0.5 * alpha.dot(alpha.dot(G))

        # Partial derivative of the dual with respect to alpha.
        def Ld0dAlpha(G, alpha):
            return np.ones_like(alpha) - alpha.dot(G)

        # Constraints: 0 <= alpha_i <= C  (encoded as b - A @ alpha >= 0)
        # and the equality sum(alpha_i * y_i) = 0.
        A = np.vstack((-np.eye(N), np.eye(N)))
        b = np.hstack((np.zeros(N), self.C * np.ones(N)))
        constraints = (
            {'type': 'eq', 'fun': lambda a: np.dot(a, y), 'jac': lambda a: y},
            {'type': 'ineq', 'fun': lambda a: b - np.dot(A, a), 'jac': lambda a: -A})

        # Maximize the dual by minimizing its negative.
        optRes = optimize.minimize(
            fun=lambda a: -Ld0(GramHXy, a),
            x0=np.ones(N),
            method='SLSQP',
            jac=lambda a: -Ld0dAlpha(GramHXy, a),
            constraints=constraints)
        self.alpha = optRes.x

        # Points with non-negligible alpha are the support vectors.
        epsilon = 1e-8
        supportIndices = self.alpha > epsilon
        self.supportVectors = X[supportIndices]
        self.supportAlphaY = y[supportIndices] * self.alpha[supportIndices]

    def predict(self, X):
        """ Predict y values in {-1, 1} """
        def predict1(x):
            # Decision value: sum over support vectors of alpha_i*y_i*k(s_i, x).
            k = np.apply_along_axis(lambda s: self.kernel(s, x), 1, self.supportVectors)
            return np.sum(k * self.supportAlphaY)
        d = np.apply_along_axis(predict1, 1, X)
        return 2 * (d > 0) - 1
# Map the target to {-1, +1} as required by the margin-based SVM above.
X_scaled_under['y'] = X_scaled_under['y'].map({1: 1, 0: -1})
X_svm = X_scaled_under.copy()
X_svm.head()
# Split features/target. NOTE(review): test_size=0.8 trains on only 20% of the
# data — presumably to keep the O(N^2) kernel solve tractable; confirm.
X = X_svm.drop(['y'], axis='columns')
y = X_svm.y
train, test, ytrain, ytest = train_test_split(X, y, test_size=0.8)

def GRBF(x1, x2):
    """Gaussian RBF kernel; called once per pair of data points."""
    diff = x1 - x2
    return np.exp(-np.dot(diff, diff) * len(x1) / 2)

from scipy import optimize  # needed by KernelSvmClassifier.fit; import before fitting
# Different kernels can be passed to different instances of the class.
SVM_RGB = KernelSvmClassifier(C=70, kernel=GRBF)
SVM_RGB.fit(np.array(train), np.array(ytrain))
SVM_RGB.supportVectors
predicted = SVM_RGB.predict(test)
print(predicted)
print(ytest)
# The metric values were previously computed and discarded (notebook display);
# print them so the script actually reports them.
predicted_df = pd.DataFrame(data=predicted)
print('Accuracy :', metrics.accuracy_score(ytest, predicted_df))
print('Precision:', metrics.precision_score(ytest, predicted_df))
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(ytest, predicted_df))
disp.plot()
plt.show()
print('Recall   :', metrics.recall_score(ytest, predicted_df))